#!/usr/bin/env python
# coding: utf-8

import os
import sys, codecs

basedir = './'
dstdir = './'
infiles = [os.path.join(basedir, name)
           for name in ('ads_train.txt', 'ads_valid.txt', 'ads_test.txt')]
outfiles = [os.path.join(dstdir, name)
            for name in ('ads_train_mz.txt', 'ads_valid_mz.txt', 'ads_test_mz.txt')]


def convert_file(infile, outfile):
    """Rewrite one tab-separated file with its columns reordered.

    Each input line is stripped of surrounding whitespace and split on tabs.
    A line is kept only if it has exactly three fields and neither the first
    nor the second field is blank; kept lines are written to ``outfile`` as
    ``third<TAB>first<TAB>second``. All other lines are skipped and counted
    as errors. Both files are read/written as UTF-8.

    Args:
        infile: Path of the UTF-8 input file.
        outfile: Path of the UTF-8 output file (created or overwritten).

    Returns:
        A ``(cnt_normal, cnt_err)`` tuple: the number of lines written and
        the number of lines skipped.

    Raises:
        IOError/OSError: If either file cannot be opened.
    """
    cnt_normal = 0
    cnt_err = 0
    # Open the input first so that a missing input does not truncate an
    # existing output file; ``with`` closes both handles even on error.
    with codecs.open(infile, 'r', encoding='utf-8') as fin, \
            codecs.open(outfile, 'w', encoding='utf-8') as fout:
        for line in fin:
            r = line.strip().split('\t')
            # Reject rows with the wrong column count or a blank first/second
            # field.
            if len(r) != 3 or r[0].strip() == '' or r[1].strip() == '':
                cnt_err += 1
                continue
            cnt_normal += 1
            fout.write('%s\t%s\t%s\n' % (r[2], r[0], r[1]))
    return cnt_normal, cnt_err


def main():
    """Convert every configured input file and report per-file counts."""
    for infile, outfile in zip(infiles, outfiles):
        cnt_normal, cnt_err = convert_file(infile, outfile)
        # A single pre-formatted string prints identically under Python 2
        # and Python 3 (the double spaces match the original output).
        print("The count of the normal data is:  %d  and the error is:  %d"
              % (cnt_normal, cnt_err))


if __name__ == '__main__':
    main()



